{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Please input your directory for the top level folder\n",
    "folder name : SUBMISSION MODEL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "dir_ = 'INPUT-PROJECT-DIRECTORY/submission_model/' # input only here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data_dir = dir_+'2. data/'\n",
    "processed_data_dir = dir_+'2. data/processed/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "raw_data_dir = '/home/ec2-user/SageMaker/efs/Hilaf/M5/data/'\n",
    "processed_data_dir = '/home/ec2-user/SageMaker/efs/Hilaf/M5/data/processed/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Main setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# General imports\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import os, sys, gc, time, warnings, pickle, psutil, random\n",
    "\n",
    "from math import ceil\n",
    "\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from tqdm import tqdm\n",
    "\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Simple \"Memory profilers\" to see memory usage\n",
    "def get_memory_usage():\n",
    "    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) \n",
    "        \n",
    "def sizeof_fmt(num, suffix='B'):\n",
    "    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:\n",
    "        if abs(num) < 1024.0:\n",
    "            return \"%3.1f%s%s\" % (num, unit, suffix)\n",
    "        num /= 1024.0\n",
    "    return \"%.1f%s%s\" % (num, 'Yi', suffix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Memory Reducer\n",
    "# :df pandas dataframe to reduce size             # type: pd.DataFrame()\n",
    "# :verbose                                        # type: bool\n",
    "def reduce_mem_usage(df, verbose=True):\n",
    "    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']\n",
    "    start_mem = df.memory_usage().sum() / 1024**2    \n",
    "    for col in df.columns:\n",
    "        col_type = df[col].dtypes\n",
    "        if col_type in numerics:\n",
    "            c_min = df[col].min()\n",
    "            c_max = df[col].max()\n",
    "            if str(col_type)[:3] == 'int':\n",
    "                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n",
    "                    df[col] = df[col].astype(np.int8)\n",
    "                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:\n",
    "                       df[col] = df[col].astype(np.int16)\n",
    "                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:\n",
    "                    df[col] = df[col].astype(np.int32)\n",
    "                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:\n",
    "                    df[col] = df[col].astype(np.int64)  \n",
    "            else:\n",
    "                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:\n",
    "                    df[col] = df[col].astype(np.float16)\n",
    "                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:\n",
    "                    df[col] = df[col].astype(np.float32)\n",
    "                else:\n",
    "                    df[col] = df[col].astype(np.float64)    \n",
    "    end_mem = df.memory_usage().sum() / 1024**2\n",
    "    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Merging by concat to not lose dtypes\n",
    "def merge_by_concat(df1, df2, merge_on):\n",
    "    merged_gf = df1[merge_on]\n",
    "    merged_gf = merged_gf.merge(df2, on=merge_on, how='left')\n",
    "    new_columns = [col for col in list(merged_gf) if col not in merge_on]\n",
    "    df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)\n",
    "    return df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "########################### Vars\n",
    "#################################################################################\n",
    "TARGET = 'sales'         # Our main target\n",
    "END_TRAIN = 1941         # Last day in train set\n",
    "MAIN_INDEX = ['id','d']  # We can identify item by these columns"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Part 1\n",
    "- Melting train data => grid_part_1\n",
    "- creating price features => grid_part_2\n",
    "- creating calendar features => grid_part_3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Load Main Data\n"
     ]
    }
   ],
   "source": [
    "########################### Load Data\n",
    "#################################################################################\n",
    "print('Load Main Data')\n",
    "\n",
    "# Here are reafing all our data \n",
    "# without any limitations and dtype modification\n",
    "train_df = pd.read_csv(raw_data_dir+'sales_train_evaluation.csv')\n",
    "prices_df = pd.read_csv(raw_data_dir+'sell_prices.csv')\n",
    "calendar_df = pd.read_csv(raw_data_dir+'calendar.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create Grid\n",
      "Train rows: 30490 59181090\n",
      "    Original grid_df:   3.6GiB\n",
      "     Reduced grid_df:   1.3GiB\n"
     ]
    }
   ],
   "source": [
    "########################### Make Grid\n",
    "#################################################################################\n",
    "print('Create Grid')\n",
    "\n",
    "# We can tranform horizontal representation representation 바꾸기\n",
    "# to vertical \"view\"\n",
    "# Our \"index\" will be 'id','item_id','dept_id','cat_id','store_id','state_id'\n",
    "# and labels are 'd_' coulmns\n",
    "\n",
    "index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']\n",
    "grid_df = pd.melt(train_df, \n",
    "                  id_vars = index_columns, \n",
    "                  var_name = 'd', \n",
    "                  value_name = TARGET)\n",
    "\n",
    "# If we look on train_df we se that \n",
    "# we don't have a lot of traning rows\n",
    "# but each day can provide more train data\n",
    "print('Train rows:', len(train_df), len(grid_df))\n",
    "\n",
    "# To be able to make predictions\n",
    "# we need to add \"test set\" to our grid\n",
    "add_grid = pd.DataFrame()\n",
    "for i in range(1,29):\n",
    "    temp_df = train_df[index_columns]\n",
    "    temp_df = temp_df.drop_duplicates()\n",
    "    temp_df['d'] = 'd_'+ str(END_TRAIN+i)\n",
    "    temp_df[TARGET] = np.nan\n",
    "    add_grid = pd.concat([add_grid,temp_df])\n",
    "\n",
    "grid_df = pd.concat([grid_df,add_grid])\n",
    "grid_df = grid_df.reset_index(drop=True)\n",
    "\n",
    "# Remove some temoprary DFs\n",
    "del temp_df, add_grid\n",
    "\n",
    "# We will not need original train_df\n",
    "# anymore and can remove it\n",
    "del train_df\n",
    "\n",
    "# You don't have to use df = df construction\n",
    "# you can use inplace=True instead.\n",
    "# like this\n",
    "# grid_df.reset_index(drop=True, inplace=True)\n",
    "\n",
    "# Let's check our memory usage\n",
    "print(\"{:>20}: {:>8}\".format('Original grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))\n",
    "\n",
    "# We can free some memory \n",
    "# by converting \"strings\" to categorical\n",
    "# it will not affect merging and \n",
    "# we will not lose any valuable data\n",
    "for col in index_columns:\n",
    "    grid_df[col] = grid_df[col].astype('category')\n",
    "\n",
    "# Let's check again memory usage\n",
    "print(\"{:>20}: {:>8}\".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Release week\n",
      "    Original grid_df:   1.8GiB\n",
      "     Reduced grid_df:   1.5GiB\n"
     ]
    }
   ],
   "source": [
    "########################### Product Release date\n",
    "#################################################################################\n",
    "print('Release week')\n",
    "\n",
    "# It seems that leadings zero values\n",
    "# in each train_df item row\n",
    "# are not real 0 sales but mean\n",
    "# absence for the item in the store\n",
    "# we can safe some memory by removing\n",
    "# such zeros\n",
    "\n",
    "# Prices are set by week\n",
    "# so it we will have not very accurate release week \n",
    "release_df = prices_df.groupby(['store_id','item_id'])['wm_yr_wk'].agg(['min']).reset_index()\n",
    "release_df.columns = ['store_id','item_id','release']\n",
    "\n",
    "# Now we can merge release_df\n",
    "grid_df = merge_by_concat(grid_df, release_df, ['store_id','item_id'])\n",
    "del release_df\n",
    "\n",
    "# We want to remove some \"zeros\" rows\n",
    "# from grid_df \n",
    "# to do it we need wm_yr_wk column\n",
    "# let's merge partly calendar_df to have it\n",
    "grid_df = merge_by_concat(grid_df, calendar_df[['wm_yr_wk','d']], ['d'])\n",
    "                      \n",
    "# Now we can cutoff some rows \n",
    "# and safe memory \n",
    "grid_df = grid_df[grid_df['wm_yr_wk']>=grid_df['release']]\n",
    "grid_df = grid_df.reset_index(drop=True)\n",
    "\n",
    "# Let's check our memory usage\n",
    "print(\"{:>20}: {:>8}\".format('Original grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))\n",
    "\n",
    "# Should we keep release week \n",
    "# as one of the features?\n",
    "# Only good CV can give the answer.\n",
    "# Let's minify the release values.\n",
    "# Min transformation will not help here \n",
    "# as int16 -> Integer (-32768 to 32767)\n",
    "# and our grid_df['release'].max() serves for int16\n",
    "# but we have have an idea how to transform \n",
    "# other columns in case we will need it\n",
    "grid_df['release'] = grid_df['release'] - grid_df['release'].min()\n",
    "grid_df['release'] = grid_df['release'].astype(np.int16)\n",
    "\n",
    "# Let's check again memory usage\n",
    "print(\"{:>20}: {:>8}\".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Save Part 1\n",
      "Size: (47735397, 10)\n"
     ]
    }
   ],
   "source": [
    "########################### Save part 1\n",
    "#################################################################################\n",
    "print('Save Part 1')\n",
    "\n",
    "# We have our BASE grid ready\n",
    "# and can save it as pickle file\n",
    "# for future use (model training)\n",
    "grid_df.to_pickle(processed_data_dir+'grid_part_1.pkl')\n",
    "\n",
    "print('Size:', grid_df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prices\n"
     ]
    }
   ],
   "source": [
    "########################### Prices\n",
    "#################################################################################\n",
    "print('Prices')\n",
    "\n",
    "# We can do some basic aggregations\n",
    "prices_df['price_max'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('max')\n",
    "prices_df['price_min'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('min')\n",
    "prices_df['price_std'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('std')\n",
    "prices_df['price_mean'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('mean')\n",
    "\n",
    "# and do price normalization (min/max scaling)\n",
    "prices_df['price_norm'] = prices_df['sell_price']/prices_df['price_max']\n",
    "\n",
    "# Some items are can be inflation dependent\n",
    "# and some items are very \"stable\"\n",
    "\n",
    "prices_df['price_nunique'] = prices_df.groupby(['store_id','item_id'])['sell_price'].transform('nunique') \n",
    "prices_df['item_nunique'] = prices_df.groupby(['store_id','sell_price'])['item_id'].transform('nunique')\n",
    "\n",
    "# I would like some \"rolling\" aggregations\n",
    "# but would like months and years as \"window\"\n",
    "calendar_prices = calendar_df[['wm_yr_wk','month','year']]\n",
    "calendar_prices = calendar_prices.drop_duplicates(subset=['wm_yr_wk']) # distinct(.keep_all = True)\n",
    "prices_df = prices_df.merge(calendar_prices[['wm_yr_wk','month','year']], on=['wm_yr_wk'], how='left')\n",
    "del calendar_prices\n",
    "\n",
    "# Now we can add price \"momentum\" (some sort of)\n",
    "# Shifted by week \n",
    "# by month mean\n",
    "# by year mean\n",
    "prices_df['price_momentum'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))\n",
    "prices_df['price_momentum_m'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')\n",
    "prices_df['price_momentum_y'] = prices_df['sell_price']/prices_df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')\n",
    "\n",
    "del prices_df['month'], prices_df['year']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mem. usage decreased to 1003.11 Mb (35.3% reduction)\n",
      "Mem. usage decreased to 313.16 Mb (60.0% reduction)\n"
     ]
    }
   ],
   "source": [
    "grid_df = reduce_mem_usage(grid_df)\n",
    "prices_df = reduce_mem_usage(prices_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merge prices and save part 2\n",
      "Mem. usage decreased to 1822.44 Mb (0.0% reduction)\n",
      "Size: (47735397, 13)\n"
     ]
    }
   ],
   "source": [
    "########################### Merge prices and save part 2\n",
    "#################################################################################\n",
    "print('Merge prices and save part 2')\n",
    "\n",
    "# Merge Prices\n",
    "original_columns = list(grid_df)\n",
    "grid_df = grid_df.merge(prices_df, on=['store_id','item_id','wm_yr_wk'], how='left')\n",
    "keep_columns = [col for col in list(grid_df) if col not in original_columns]\n",
    "grid_df = grid_df[MAIN_INDEX+keep_columns]\n",
    "grid_df = reduce_mem_usage(grid_df)\n",
    "\n",
    "# Safe part 2\n",
    "grid_df.to_pickle(processed_data_dir+'grid_part_2.pkl')\n",
    "print('Size:', grid_df.shape)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We don't need prices_df anymore\n",
    "del prices_df, grid_df\n",
    "\n",
    "# We can remove new columns\n",
    "# or just load part_1\n",
    "grid_df = pd.read_pickle(processed_data_dir+'grid_part_1.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "########################### Merge calendar\n",
    "#################################################################################\n",
    "grid_df = grid_df[MAIN_INDEX]\n",
    "\n",
    "# Merge calendar partly\n",
    "icols = ['date',\n",
    "         'd',\n",
    "         'event_name_1',\n",
    "         'event_type_1',\n",
    "         'event_name_2',\n",
    "         'event_type_2',\n",
    "         'snap_CA',\n",
    "         'snap_TX',\n",
    "         'snap_WI']\n",
    "\n",
    "grid_df = grid_df.merge(calendar_df[icols], on=['d'], how='left')\n",
    "\n",
    "# Minify data\n",
    "# 'snap_' columns we can convert to bool or int8\n",
    "icols = ['event_name_1',\n",
    "         'event_type_1',\n",
    "         'event_name_2',\n",
    "         'event_type_2',\n",
    "         'snap_CA',\n",
    "         'snap_TX',\n",
    "         'snap_WI']\n",
    "for col in icols:\n",
    "    grid_df[col] = grid_df[col].astype('category')\n",
    "\n",
    "# Convert to DateTime\n",
    "grid_df['date'] = pd.to_datetime(grid_df['date'])\n",
    "\n",
    "# Make some features from date\n",
    "grid_df['tm_d'] = grid_df['date'].dt.day.astype(np.int8)\n",
    "grid_df['tm_w'] = grid_df['date'].dt.week.astype(np.int8)\n",
    "grid_df['tm_m'] = grid_df['date'].dt.month.astype(np.int8)\n",
    "grid_df['tm_y'] = grid_df['date'].dt.year\n",
    "grid_df['tm_y'] = (grid_df['tm_y'] - grid_df['tm_y'].min()).astype(np.int8)\n",
    "grid_df['tm_wm'] = grid_df['tm_d'].apply(lambda x: ceil(x/7)).astype(np.int8) # 오늘 몇째주?\n",
    "\n",
    "grid_df['tm_dw'] = grid_df['date'].dt.dayofweek.astype(np.int8) \n",
    "grid_df['tm_w_end'] = (grid_df['tm_dw']>=5).astype(np.int8)\n",
    "\n",
    "# Remove date\n",
    "del grid_df['date']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Save part 3\n",
      "Size: (47735397, 16)\n"
     ]
    }
   ],
   "source": [
    "########################### Save part 3 (Dates)\n",
    "#################################################################################\n",
    "print('Save part 3')\n",
    "\n",
    "# Safe part 3\n",
    "grid_df.to_pickle(processed_data_dir+'grid_part_3.pkl')\n",
    "print('Size:', grid_df.shape)\n",
    "\n",
    "# We don't need calendar_df anymore\n",
    "del calendar_df\n",
    "del grid_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "########################### Some additional cleaning\n",
    "#################################################################################\n",
    "\n",
    "## Part 1\n",
    "# Convert 'd' to int\n",
    "grid_df = pd.read_pickle(processed_data_dir+'grid_part_1.pkl')\n",
    "grid_df['d'] = grid_df['d'].apply(lambda x: x[2:]).astype(np.int16)\n",
    "\n",
    "# Remove 'wm_yr_wk'\n",
    "# as test values are not in train set\n",
    "del grid_df['wm_yr_wk']\n",
    "grid_df.to_pickle(processed_data_dir+'grid_part_1.pkl')\n",
    "\n",
    "del grid_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3. Part2\n",
    "- Lag featrue\n",
    "- Lag rolling feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Create lags\n",
      "7.96 min: Lags\n",
      "Create rolling aggs\n",
      "Rolling period: 7\n",
      "Rolling period: 14\n",
      "Rolling period: 30\n",
      "Rolling period: 60\n",
      "Rolling period: 180\n",
      "Shifting period: 1\n",
      "Shifting period: 7\n",
      "Shifting period: 14\n",
      "19.06 min: Lags\n"
     ]
    }
   ],
   "source": [
    "grid_df = pd.read_pickle(processed_data_dir+'grid_part_1.pkl')\n",
    "\n",
    "# We need only 'id','d','sales'\n",
    "# to make lags and rollings\n",
    "grid_df = grid_df[['id','d','sales']]\n",
    "SHIFT_DAY = 28\n",
    "\n",
    "# Lags\n",
    "# with 28 day shift\n",
    "start_time = time.time()\n",
    "print('Create lags')\n",
    "\n",
    "LAG_DAYS = [col for col in range(SHIFT_DAY,SHIFT_DAY+15)]\n",
    "grid_df = grid_df.assign(**{\n",
    "        '{}_lag_{}'.format(col, l): grid_df.groupby(['id'])[col].transform(lambda x: x.shift(l))\n",
    "        for l in LAG_DAYS\n",
    "        for col in [TARGET]\n",
    "    })\n",
    "\n",
    "# Minify lag columns\n",
    "for col in list(grid_df):\n",
    "    if 'lag' in col:\n",
    "        grid_df[col] = grid_df[col].astype(np.float16)\n",
    "\n",
    "print('%0.2f min: Lags' % ((time.time() - start_time) / 60))\n",
    "\n",
    "# Rollings\n",
    "# with 28 day shift\n",
    "start_time = time.time()\n",
    "print('Create rolling aggs')\n",
    "\n",
    "for i in [7,14,30,60,180]:\n",
    "    print('Rolling period:', i)\n",
    "    grid_df['rolling_mean_'+str(i)] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).mean()).astype(np.float16)\n",
    "    grid_df['rolling_std_'+str(i)]  = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(SHIFT_DAY).rolling(i).std()).astype(np.float16)\n",
    "\n",
    "# Rollings\n",
    "# with sliding shift\n",
    "for d_shift in [1,7,14]: \n",
    "    print('Shifting period:', d_shift)\n",
    "    for d_window in [7,14,30,60]:\n",
    "        col_name = 'rolling_mean_tmp_'+str(d_shift)+'_'+str(d_window)\n",
    "        grid_df[col_name] = grid_df.groupby(['id'])[TARGET].transform(lambda x: x.shift(d_shift).rolling(d_window).mean()).astype(np.float16)\n",
    "    \n",
    "    \n",
    "print('%0.2f min: Lags' % ((time.time() - start_time) / 60))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Save lags and rollings\n"
     ]
    }
   ],
   "source": [
    "########################### Export\n",
    "#################################################################################\n",
    "print('Save lags and rollings')\n",
    "grid_df.to_pickle(processed_data_dir+'lags_df_'+str(SHIFT_DAY)+'.pkl')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4. Part3\n",
    "- Mean encoding feature"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Encoding ['state_id']\n",
      "Encoding ['store_id']\n",
      "Encoding ['cat_id']\n",
      "Encoding ['dept_id']\n",
      "Encoding ['state_id', 'cat_id']\n",
      "Encoding ['state_id', 'dept_id']\n",
      "Encoding ['store_id', 'cat_id']\n",
      "Encoding ['store_id', 'dept_id']\n",
      "Encoding ['item_id']\n",
      "Encoding ['item_id', 'state_id']\n",
      "Encoding ['item_id', 'store_id']\n"
     ]
    }
   ],
   "source": [
    "########################### Apply on grid_df\n",
    "#################################################################################\n",
    "# lets read grid from \n",
    "# https://www.kaggle.com/kyakovlev/m5-simple-fe\n",
    "# to be sure that our grids are aligned by index\n",
    "grid_df = pd.read_pickle(processed_data_dir+'grid_part_1.pkl')\n",
    "grid_df['sales'][grid_df['d']>(1941-28)] = np.nan\n",
    "base_cols = list(grid_df)\n",
    "\n",
    "icols =  [\n",
    "            ['state_id'],\n",
    "            ['store_id'],\n",
    "            ['cat_id'],\n",
    "            ['dept_id'],\n",
    "            ['state_id', 'cat_id'],\n",
    "            ['state_id', 'dept_id'],\n",
    "            ['store_id', 'cat_id'],\n",
    "            ['store_id', 'dept_id'],\n",
    "            ['item_id'],\n",
    "            ['item_id', 'state_id'],\n",
    "            ['item_id', 'store_id']\n",
    "            ]\n",
    "\n",
    "for col in icols:\n",
    "    print('Encoding', col)\n",
    "    col_name = '_'+'_'.join(col)+'_'\n",
    "    grid_df['enc'+col_name+'mean'] = grid_df.groupby(col)['sales'].transform('mean').astype(np.float16)\n",
    "    grid_df['enc'+col_name+'std'] = grid_df.groupby(col)['sales'].transform('std').astype(np.float16)\n",
    "\n",
    "keep_cols = [col for col in list(grid_df) if col not in base_cols]\n",
    "grid_df = grid_df[['id','d']+keep_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Save Mean/Std encoding\n"
     ]
    }
   ],
   "source": [
    "#################################################################################\n",
    "print('Save Mean/Std encoding')\n",
    "grid_df.to_pickle(processed_data_dir+'mean_encoding_df.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "conda_mxnet_p36",
   "language": "python",
   "name": "conda_mxnet_p36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
